library(tidyverse)
## Warning: パッケージ 'tidyverse' はバージョン 4.3.1 の R の下で造られました
## Warning: パッケージ 'purrr' はバージョン 4.3.3 の R の下で造られました
## Warning: パッケージ 'dplyr' はバージョン 4.3.2 の R の下で造られました
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)
library(purrr)
library(stringr)
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

#データ整理と変換

##(a)Semester Dataの整形

#csvファイルの読み込み。デフォルトで一行目が列名として扱われる。
semester1 <- read_csv("semester_data_1.csv")
## Rows: 6632 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (6): x1, x2, x3, x4, x5, x6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
semester2 <- read_csv("semester_data_2.csv")
## Rows: 7258 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): x2
## dbl (5): x1, x3, x4, x5, x6
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#列ごとの型を確認する。1と2で型が違うため揃える必要があると判断。
str(semester1)
## spc_tbl_ [6,632 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ x1: chr [1:6632] "unitid" "100654" "100654" "100654" ...
##  $ x2: chr [1:6632] "instnm" "ALABAMA A&M UNIVERSITY" "ALABAMA A&M UNIVERSITY" "ALABAMA A & M UNIVERSITY" ...
##  $ x3: chr [1:6632] "semester" "1" "1" "1" ...
##  $ x4: chr [1:6632] "quarter" "0" "0" "0" ...
##  $ x5: chr [1:6632] "year" "1991" "1992" "1993" ...
##  $ x6: chr [1:6632] "Y" "1991" "1992" "1993" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   x1 = col_character(),
##   ..   x2 = col_character(),
##   ..   x3 = col_character(),
##   ..   x4 = col_character(),
##   ..   x5 = col_character(),
##   ..   x6 = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>
str(semester2)
## spc_tbl_ [7,258 × 6] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ x1: num [1:7258] 182634 182634 182634 182634 182634 ...
##  $ x2: chr [1:7258] "COLBY-SAWYER COLLEGE" "COLBY-SAWYER COLLEGE" "COLBY-SAWYER COLLEGE" "COLBY-SAWYER COLLEGE" ...
##  $ x3: num [1:7258] 1 1 1 1 1 1 1 1 1 1 ...
##  $ x4: num [1:7258] 0 0 0 0 0 0 0 0 0 0 ...
##  $ x5: num [1:7258] 1991 1992 1993 1995 1996 ...
##  $ x6: num [1:7258] 1991 1992 1993 1995 1996 ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   x1 = col_double(),
##   ..   x2 = col_character(),
##   ..   x3 = col_double(),
##   ..   x4 = col_double(),
##   ..   x5 = col_double(),
##   ..   x6 = col_double()
##   .. )
##  - attr(*, "problems")=<externalptr>
#型を揃える。character型に統一したほうが良いと判断。
semester2 <- semester2 %>% mutate(across(-x2, as.character))
#結合。
semester_bind <- bind_rows(semester1,semester2)
#一行目を取り除き、二行目を列名にする。
semester_bind <- semester_bind %>%
  slice(-1) %>%
  set_names(as.character(semester_bind[1, ]))
#Y列を削除。
semester_bind <- semester_bind %>% select(-c("Y"))
# semester制が導入された年の列を作成
semester_bind <- semester_bind %>%
  group_by(unitid) %>% 
  mutate(
    yearsem = ifelse(
      any(semester == 0) & any(semester == 1),
      unique(year[semester == 1 & lag(semester) == 0]),
      NA
    )
  ) %>%
  ungroup()
#semester制導入後を示すafter列を作成
semester_bind <- semester_bind %>% 
  mutate(
    after = ifelse(
      year >= yearsem,
      1,
      0
    )
  )
DT::datatable(semester_bind)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

(b)Gradrate Dataの整形

#1994年以外の年の.xlsファイルがあるため、paste0()でディレクトリをまとめてしまう
available_years <- setdiff(1991:2014, c(1994))
file_paths <- paste0("C:/Users/nasuk/OneDrive/デスクトップ/ra_bootcamp_warmp/", available_years, ".xlsx")
#まとめたディレクトリを用いて、一度に結合する
gradrate_bind <- file_paths %>%
  map(~ read_excel(.x)) %>%
  bind_rows()
#女子学生の4年卒業率に0.01をかけて、0から1のスケールに変更
gradrate_bind <- gradrate_bind %>%
  mutate(womengradrate4yr = women_gradrate_4yr * 0.01)
#データ型を確認。num型に揃える必要があるため、変換する。
str(gradrate_bind)
## tibble [18,883 × 10] (S3: tbl_df/tbl/data.frame)
##  $ unitid            : num [1:18883] 100654 100663 100751 100858 101435 ...
##  $ year              : num [1:18883] 1991 1991 1991 1991 1991 ...
##  $ totcohortsize     : chr [1:18883] "1010" "937" "2511" "3024" ...
##  $ w_cohortsize      : num [1:18883] 527 500 1348 1496 101 ...
##  $ m_cohortsize      : num [1:18883] 483 437 1163 1528 88 ...
##  $ tot4yrgrads       : num [1:18883] 152 82 630 846 60 109 73 66 217 117 ...
##  $ m_4yrgrads        : chr [1:18883] "32" "33" "213" "312" ...
##  $ w_4yrgrads        : num [1:18883] 120 49 417 534 35 67 32 43 128 85 ...
##  $ women_gradrate_4yr: num [1:18883] 22.8 9.8 30.9 35.7 34.6 ...
##  $ womengradrate4yr  : num [1:18883] 0.228 0.098 0.309 0.357 0.346 ...
gradrate_bind <- gradrate_bind %>% 
  mutate(
    totcohortsize = as.numeric(totcohortsize), 
    m_4yrgrads = as.numeric(m_4yrgrads)
  )
#男性と全体の卒業率を計算
gradrate_bind <- gradrate_bind %>%
  mutate(totgradrate4yr = tot4yrgrads / totcohortsize)
  
gradrate_bind <- gradrate_bind %>%
  mutate(mengradrate4yr = m_4yrgrads / m_cohortsize)
#有効数字3桁を指定。DTで見るとところどころ有効係数が変わっているようにも見えるが…
gradrate_bind <- gradrate_bind %>%
  mutate(
    totgradrate4yr = sprintf("%.3g", totgradrate4yr),
    mengradrate4yr = sprintf("%.3g", mengradrate4yr)
  )
#2010年までの行のみ抽出
gradrate_bind <- gradrate_bind %>%
  filter(year <= 2010)
DT::datatable(gradrate_bind)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

##(c)Covariates Dataの整形

#生データ読み込み
covariates <- read_excel("C:/Users/nasuk/OneDrive/デスクトップ/ra_bootcamp_warmp/covariates.xlsx")
#列名の変更
covariates <- covariates %>% rename(unitid = university_id)
#"aaaa"の削除
covariates$unitid <- covariates$unitid %>% str_remove_all("aaaa")
#wide型に変換
covariates <- covariates %>% pivot_wider(names_from = category, values_from = value)

outcome 1991-2016 1994なし 1991-2010にフィルター semester 1991-2010

#年の範囲を揃える
covariates <- covariates %>% filter(year>=1991, year<=2010, year != 1994)
#gradrateにおけるunitidの一覧を作る
unitids_vector <- unique(gradrate_bind$unitid)
#gradrateと一致するunitidのみ抽出
covariates <- covariates %>% filter(unitid %in% unitids_vector)
DT::datatable(covariates)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html
#列の結合に用いるunitidとyearの型が一致せず。gradrate_bindのみnumなのでcharに揃える。
str(semester_bind)
## tibble [13,889 × 7] (S3: tbl_df/tbl/data.frame)
##  $ unitid  : chr [1:13889] "100654" "100654" "100654" "100654" ...
##  $ instnm  : chr [1:13889] "ALABAMA A&M UNIVERSITY" "ALABAMA A&M UNIVERSITY" "ALABAMA A & M UNIVERSITY" "ALABAMA A & M UNIVERSITY" ...
##  $ semester: chr [1:13889] "1" "1" "1" "1" ...
##  $ quarter : chr [1:13889] "0" "0" "0" "0" ...
##  $ year    : chr [1:13889] "1991" "1992" "1993" "1995" ...
##  $ yearsem : chr [1:13889] NA NA NA NA ...
##  $ after   : num [1:13889] NA NA NA NA NA NA NA NA NA NA ...
str(gradrate_bind)
## tibble [13,889 × 12] (S3: tbl_df/tbl/data.frame)
##  $ unitid            : num [1:13889] 100654 100663 100751 100858 101435 ...
##  $ year              : num [1:13889] 1991 1991 1991 1991 1991 ...
##  $ totcohortsize     : num [1:13889] 1010 937 2511 3024 189 ...
##  $ w_cohortsize      : num [1:13889] 527 500 1348 1496 101 ...
##  $ m_cohortsize      : num [1:13889] 483 437 1163 1528 88 ...
##  $ tot4yrgrads       : num [1:13889] 152 82 630 846 60 109 73 66 217 117 ...
##  $ m_4yrgrads        : num [1:13889] 32 33 213 312 25 42 41 23 89 32 ...
##  $ w_4yrgrads        : num [1:13889] 120 49 417 534 35 67 32 43 128 85 ...
##  $ women_gradrate_4yr: num [1:13889] 22.8 9.8 30.9 35.7 34.6 ...
##  $ womengradrate4yr  : num [1:13889] 0.228 0.098 0.309 0.357 0.346 ...
##  $ totgradrate4yr    : chr [1:13889] "0.15" "0.0875" "0.251" "0.28" ...
##  $ mengradrate4yr    : chr [1:13889] "0.0663" "0.0755" "0.183" "0.204" ...
str(covariates)
## tibble [13,889 × 6] (S3: tbl_df/tbl/data.frame)
##  $ unitid          : chr [1:13889] "100654" "100654" "100654" "100654" ...
##  $ year            : chr [1:13889] "1991" "1992" "1993" "1995" ...
##  $ instatetuition  : chr [1:13889] "1298" "1600" "1600" "2022" ...
##  $ costs           : chr [1:13889] "53.121007" "52.536624" "50.39972" "68.024267" ...
##  $ faculty         : chr [1:13889] "223" "267" "262" "298" ...
##  $ white_cohortsize: chr [1:13889] "11" "5" "7" "18" ...
gradrate_bind <- gradrate_bind %>% 
  mutate(
    unitid = as.character(unitid),
    year = as.character(year)
  )
#unitidとyearで結合してマスターデータ作成
master <- semester_bind %>%
  left_join(gradrate_bind, by =c("unitid","year")) %>%
  left_join(covariates, by =c("unitid","year") )
DT::datatable(master)
## Warning in instance$preRenderHook(instance): It seems your data is too big for
## client-side DataTables. You may consider server-side processing:
## https://rstudio.github.io/DT/server.html

#分析

##(a)記述統計

##(b)回帰分析

model <- lm(totgradrate4yr ~ after, data=master)
stargazer(model)
## 
## % Table created by stargazer v.5.2.3 by Marek Hlavac, Social Policy Institute. E-mail: marek.hlavac at gmail.com
## % Date and time: 日, 8 18, 2024 - 21:31:03
## \begin{table}[!htbp] \centering 
##   \caption{} 
##   \label{} 
## \begin{tabular}{@{\extracolsep{5pt}}lc} 
## \\[-1.8ex]\hline 
## \hline \\[-1.8ex] 
##  & \multicolumn{1}{c}{\textit{Dependent variable:}} \\ 
## \cline{2-2} 
## \\[-1.8ex] & totgradrate4yr \\ 
## \hline \\[-1.8ex] 
##  after & 0.031$^{**}$ \\ 
##   & (0.012) \\ 
##   & \\ 
##  Constant & 0.251$^{***}$ \\ 
##   & (0.010) \\ 
##   & \\ 
## \hline \\[-1.8ex] 
## Observations & 1,045 \\ 
## R$^{2}$ & 0.006 \\ 
## Adjusted R$^{2}$ & 0.005 \\ 
## Residual Std. Error & 0.183 (df = 1043) \\ 
## F Statistic & 6.429$^{**}$ (df = 1; 1043) \\ 
## \hline 
## \hline \\[-1.8ex] 
## \textit{Note:}  & \multicolumn{1}{r}{$^{*}$p$<$0.1; $^{**}$p$<$0.05; $^{***}$p$<$0.01} \\ 
## \end{tabular} 
## \end{table}